# Code chunk settings
knitr::opts_chunk$set(echo = TRUE)

Business Task

To increase company revenue by developing a marketing strategy that will convert Cyclistic’s casual riders into annual members.

Goal of Analysis

To identify key trends and insights into how Cyclistic members and casual riders use Cyclistic bikes differently.

Data Sources

One year of Cyclistic’s bicycle user data has been provided for analysis. This data was provided in the form of twelve individual CSV files. Each file represents one month of user data spanning from July 2022 through June 2023.
Source: https://divvy-tripdata.s3.amazonaws.com/index.html

Bicycle station location data was obtained through the City of Chicago Data Portal. This data set claims to include the GPS coordinates of all Divvy (Cyclistic) bicycle stations in the City of Chicago.
Source: https://data.cityofchicago.org/Transportation/Divvy-Bicycle-Stations/bbyy-e7gq/data

Additional station location data was verified and compiled manually for stations that exist in Cyclistic’s trip data but did not exist in the public listing of station locations obtained from the City of Chicago Data Portal.
Source: data/stations_additional/stations_additional.csv

Packages

# Load packages
library(tidyverse) # for many data analysis tools
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales) # for modifying units of measure
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(sf) # for working with geospatial data & maps
## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(ggmap) # for more detailed background maps
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(plotly) # for better visualization of maps
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggmap':
## 
##     wind
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(osmdata) # for working with openstreetmap.org
## Data (c) OpenStreetMap contributors, ODbL 1.0. https://www.openstreetmap.org/copyright
library(janitor) # for cleaning data
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggthemes) # for more themes (including theme_map())

Variables

# Set variables
color_member <- "#00204D"
color_casual <- "#6699FF"

Cleaning & Manipulation of Data

# Load the past 12 months of Cyclistic's historical data into data frames
july <- read.csv("data/202207-divvy-tripdata.csv")
august <- read.csv("data/202208-divvy-tripdata.csv")
september <- read.csv("data/202209-divvy-publictripdata.csv")
october <- read.csv("data/202210-divvy-tripdata.csv")
november <- read.csv("data/202211-divvy-tripdata.csv")
december <- read.csv("data/202212-divvy-tripdata.csv")
january <- read.csv("data/202301-divvy-tripdata.csv")
february <- read.csv("data/202302-divvy-tripdata.csv")
march <- read.csv("data/202303-divvy-tripdata.csv")
april <- read.csv("data/202304-divvy-tripdata.csv")
may <- read.csv("data/202305-divvy-tripdata.csv")
june <- read.csv("data/202306-divvy-tripdata.csv")


# Combine the 12 historical data frames (months) into a single data frame (year)
data <- rbind(july, august, september, october, november, december, january, february, march, april, may, june)


# Create a dataframe of the unmodified data set
data_unclean <- data


# Remove monthly data from environment
rm(january, february, march, april, may, june, july, august, september, october, november, december)


# Check for duplicate ride id's
has_duplicates <- length(data$ride_id) != length(unique(data$ride_id))
print(has_duplicates)
## [1] FALSE
# Check for NA values
na_counts <- colSums(is.na(data))
print(na_counts)
##            ride_id      rideable_type         started_at           ended_at 
##                  0                  0                  0                  0 
## start_station_name   start_station_id   end_station_name     end_station_id 
##                  0                  0                  0                  0 
##          start_lat          start_lng            end_lat            end_lng 
##                  0                  0               5795               5795 
##      member_casual 
##                  0
# Drop rows with missing ending coordinates (these show excessive ride time)
data <- data %>% drop_na(c(end_lat, end_lng))


# Check for empty string values
char_vars <- data %>% 
select_if(is.character)
empty_strings <- function(vec) {
  sum(vec == "", na.rm = TRUE)
}
char_vars <- sapply(char_vars, empty_strings)
print(char_vars)
##            ride_id      rideable_type         started_at           ended_at 
##                  0                  0                  0                  0 
## start_station_name   start_station_id   end_station_name     end_station_id 
##             857860             857992             909870             910011 
##      member_casual 
##                  0
# Convert empty strings and zero values to NA
data$start_station_name[data$start_station_name == ""] = NA
data$end_station_name[data$end_station_name == ""] = NA
data$start_station_id[data$start_station_id == ""] = NA
data$end_station_id[data$end_station_id == ""] = NA


# Convert 'zero' coordinate values to NA
data$end_lat[data$end_lat == 0] = NA
data$end_lng[data$end_lng == 0] = NA


# Trim excess whitespace from strings
data$start_station_name <- trimws(gsub("\\s+", " ", data$start_station_name))
data$end_station_name <- trimws(gsub("\\s+", " ", data$end_station_name))
data$start_station_id <- trimws(gsub("\\s+", " ", data$start_station_id))
data$end_station_id <- trimws(gsub("\\s+", " ", data$end_station_id))
data$ride_id <- trimws(gsub("\\s+", " ", data$ride_id))
data$rideable_type <- trimws(gsub("\\s+", " ", data$rideable_type))


# Drop rows if start time is later than end time (impossible scenario)
data <- data %>% filter(!started_at >= ended_at)


# Convert started_at and ended_at columns to datetime format
data <- mutate(data, started_at = ymd_hms(started_at))
data <- mutate(data, ended_at = ymd_hms(ended_at))


# Remove underscore in rideable_type column values and convert to title case
data$rideable_type <- str_to_title(str_replace(data$rideable_type, "_", " "))


# Convert member_casual column to title case
data$member_casual <- str_to_title(data$member_casual)


# Create ride_length column and convert to minutes
data <- mutate(data, ride_length = round(difftime(ended_at, started_at, units  = "mins"), digits = 2))


# Remove rows with ride_length less than 1 minute
# This helps exclude observations of "false rides" (i.e. someone is un-docking and immediately re-docking the bike)
data <- filter(data, ride_length >= 1.00)


# Create a column for day of week that the ride started
data <- mutate(data, day_of_week = weekdays(started_at, abbreviate = F))


# Create column for month that the ride started
data <- mutate(data, month = month(started_at, label = T))


# Create a function to determine the phase of day
day_phase <- function(time) {
  hr <- lubridate::hour(time)
  dplyr::case_when(hr > 5 & hr < 12 ~ 'Morning',
                   hr >= 12 & hr < 18 ~ 'Afternoon',
                   hr >=  18 & hr <= 23 ~ 'Evening',
                   TRUE ~ 'Night')
}

# Create column indicating the phase of the day (morning, afternoon, evening, night)
data <- mutate(data, phase_of_day = day_phase(started_at))


# import Divvy's complete station list
# source: https://data.cityofchicago.org/Transportation/Divvy-Bicycle-Stations-All-Map/bk89-9dk7
stations_divvy <- read.csv("data/bike-stations_city-of-chicago/Divvy_Bicycle_Stations_-_All_-_Map.csv") %>% 
  select(Station.Name, Latitude, Longitude)

# import additional stations not included in City of Chicago Divvy station listing
# These locations were individually collected by analyzing Google Maps and the Divvy station list
stations_additional <- read.csv("data/stations_additional/stations_additional.csv")


# combine station lists
stations <- rbind(stations_divvy, stations_additional)


# Trim white space in station list
stations$Station.Name <- trimws(gsub("\\s+", " ", stations$Station.Name))


# Create start_stations data frame & rename variables (for future join)
start_stations <- stations %>%
  rename(start_station_name = Station.Name,
         start_station_lat = Latitude,
         start_station_lng = Longitude)


# Create end_stations data frame & rename variables (for future join)
end_stations <- stations %>%
  rename(end_station_name = Station.Name,
         end_station_lat = Latitude,
         end_station_lng = Longitude)


# Join station data with main data set
data <- left_join(data, start_stations)
## Joining with `by = join_by(start_station_name)`
data <- left_join(data, end_stations)
## Joining with `by = join_by(end_station_name)`
# Remove end_stations and start_stations data sets from environment now that join is complete
rm(start_stations, end_stations)

Issues with the data

Missing start station names and/or end station names

data %>%
  select(start_station_name, end_station_name) %>% 
  filter(is.na(start_station_name) | is.na(end_station_name)) %>%
  summarise(
    rides = n()
  )
##     rides
## 1 1305944

There are over 1.3 million rides, roughly 23%, that do not include a start station name or end station name. These observations will not be included in the portions of this analysis involving station names or station location. However, in an effort to use as much of the data as possible, other variables from these observations will still be utilized. For example, average ride length, a statistic that isn’t relevant to station name or station location, will be calculated using these incomplete observations.

Truncated coordinates

# Rides with truncated coordinates
data %>% 
  select(start_station_name, start_lat, start_lng, end_station_name, end_lat, end_lng) %>% 
  filter(
    (nchar(start_lat) <= 5 & nchar(start_lng) <=6) |
    (nchar(end_lat) <= 5 & nchar(end_lng) <=6)
  ) %>%
  summarise(
    truncated_rides = n()
  )
##   truncated_rides
## 1         1403674

Over 1.4 million rides (about 25% of the data set) contains truncated geographic coordinates. These truncations shorten the numeric representing latitude and/or longitude from 5 or more decimal places to 2 decimal places. For example, a latitude of 41.91964 may be represented as 41.91, thus reducing the accuracy of the station location on maps by several city blocks.

# Quantity of truncated coordinates by bicycle type
data %>% 
  filter(
    (nchar(start_lat) <= 5 & nchar(start_lng) <=6) |
    (nchar(end_lat) <= 5 & nchar(end_lng) <=6)
  ) %>% 
  group_by(rideable_type) %>% 
  summarise(rides = n())
## # A tibble: 2 × 2
##   rideable_type   rides
##   <chr>           <int>
## 1 Classic Bike     3573
## 2 Electric Bike 1400101

A wide majority of coordinate truncations occur on electric bikes.

# ride start coordinates variable
trunc_coords_start <- data %>%
  select(start_lat, start_lng) %>% 
  filter((nchar(start_lat) <= 5 & nchar(start_lng) <=6)) %>% 
  group_by(start_lat, start_lng) %>% 
  summarise(n = n(), .groups = 'drop')

# ride end coordinates variable
trunc_coords_end <- data %>% 
  select(end_lat, end_lng) %>% 
  filter((nchar(end_lat) <= 5 & nchar(end_lng) <=6)) %>%
  group_by(end_lat, end_lng) %>% 
  summarise(n = n(), .groups = 'drop')

# Set bounding box & import map
bbox = c(top = 42.1196, right = -87.3097, bottom = 41.5672, left = -88.1007)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 10)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# plot truncated coordinates on map
ggmap(import_map) +
  geom_point(data = trunc_coords_start, mapping = aes(x = start_lng, y = start_lat), size = 1, color = "navyblue") +
  geom_point(data = trunc_coords_end, mapping = aes(x = end_lng, y = end_lat), size = 1, color = "navyblue") +
   scale_color_distiller(palette = 1, direction = 1) +
  labs(title = "Truncated station coordinates", subtitle = "Present for over 1.4M rides", caption = "") +
  theme_map()
## Warning: Removed 10 rows containing missing values (`geom_point()`).

The truncated coordinates appear in a perfect grid when viewed over a map of the Chicago area.

Inaccurate station coordinates in Cyclistic dataset

start_coordinates <- data %>% 
  select(start_station_name, start_lat, start_lng) %>% 
  filter(start_station_name == "Streeter Dr & Grand Ave")

bbox = c(top = 42.1196, right = -87.3097, bottom = 41.5672, left = -88.1007)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 10)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
ggmap(import_map) +
  geom_point(data = start_coordinates, mapping = aes(x = start_lng, y = start_lat), color = "darkred", alpha = 0.45) +
  labs(title = "Inaccurate station coordinates", subtitle = "Streeter Dr & Grand Ave", caption = "") + 
  theme_map() +
  theme(plot.title = element_text(size = 13.5))

In many cases, the coordinates provided for individual station names appear scattered when plotted on a map of Chicago. This issue, compounded by truncated station coordinate data, calls the reliability of the provided station coordinates into question. Because these inaccuracies constitute such a large portion of the Cyclistic data set, station coordinates will instead be referenced from an outside source. The majority of station coordinates will be referenced from the Chicago Data Portal’s listing of Divvy (Cyclistic) bike station locations.

bbox = c(top = 42.1196, right = -87.3097, bottom = 41.5672, left = -88.1007)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 10)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
ggmap(import_map) +
  geom_point(data = stations_divvy, mapping = aes(x = Longitude, y = Latitude), size = 1, color = "darkgreen", alpha = 0.75) +
  labs(title = "Publicly available Divvy station listing", caption = "Data provided by Chicago Data Portal") +
  theme_map()

There are several stations that are not listed in Chicago’s Data Portal but do exist in the Cyclistic ride share data. There are also several stations with more than one station name. These additional stations and coodinates have been verified and compiled manually. The coordinates of these additional stations, along with the coordinates of Chicago’s Divvy station listing, will be used to reference station location throughout this study.

Corrected geo-coordinates

coordinates <- data %>% filter(start_station_name == "Streeter Dr & Grand Ave")

bbox = c(top = 42.1196, right = -87.3097, bottom = 41.5672, left = -88.1007)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 10)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
ggmap(import_map) +
  geom_point(data = coordinates, mapping = aes(x = start_station_lng, y = start_station_lat), color = "darkred", alpha = 0.45) +
  labs(title = "Corrected station coordinates", subtitle = "Streeter Dr & Grand Ave", caption = "") + 
  theme_map() +
  theme(plot.title = element_text(size = 13.5))

Summary of Analysis

Who is riding Cyclistic bikes?

Total rides

data %>% 
  group_by(member_casual) %>% 
  summarise(n = n()) %>% 
  ggplot() +
    aes(x = member_casual, y = n, fill = member_casual) +
    geom_bar(position='dodge', stat='identity') +
    scale_y_continuous(label = label_number(scale_cut = cut_short_scale())) +
    scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
    geom_text(aes(label = format(n, big.mark = ",")), vjust = -0.5, size = 3.5, position = position_dodge(0.9)) +
    labs(x = "", y = "Rides", title = "Total rides") +
    theme_classic() +
    theme(legend.position = "none")

Rides performed by casual riders made up roughly 36% of the selected data set. Member rides constitute about 64% of all rides analyzed.

data %>% 
  group_by(member_casual) %>%
  summarise(avg_ride_length = mean(as.numeric(ride_length), na.rm=T)) %>% 
  ggplot() +
  aes(x = member_casual, y = avg_ride_length, fill = member_casual) +
  geom_bar(position='dodge', stat='identity') +
  scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
  geom_text(aes(label = round(avg_ride_length, 1)), vjust = -0.5, size = 3.5, position = position_dodge(0.9)) +
  labs(x = "", y = "Average Ride Length (Mins)", title = "Average Ride Length (Minutes)", subtitle = "") +
  theme_classic() +
  theme(legend.position = "none") 

Casual riders tend to ride longer with an average ride length more than eight minutes greater than that of the average member ride.

What kinds of bikes are they riding?

data %>% 
  drop_na(rideable_type, member_casual) %>% 
  group_by(rideable_type, member_casual) %>% 
  summarize(quantity = n(), .groups = 'drop') %>% 
  ggplot() +
    aes(x = rideable_type, y = quantity, fill = member_casual) +
    geom_bar(position='dodge', stat='identity') +
    scale_y_continuous(label = label_number(scale_cut = cut_short_scale())) +
    scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
    labs(x = "", y = "Rides", title = "Rides by bike type") +
    theme_classic() +
    theme(legend.title = element_blank()) 

The data shows that both casuals and members ride electric bikes more often than classic bikes.

Docked bikes

The data set includes three possible values for the “rideable_type” variable: electric bike, classic bike or docked bike. The “docked bike” value is included in 138,910 observations and should be discussed with Cyclistic’s technical team in order to better understand what this data represents.

data %>% 
  drop_na(rideable_type, member_casual) %>% 
  group_by(rideable_type, member_casual) %>%
  summarise(avg_ride = mean(as.numeric(ride_length), na.rm = T), .groups = 'drop') %>% 
  ggplot() +
    aes(x = rideable_type, y = avg_ride, fill = member_casual) +
    geom_bar(position='dodge', stat='identity') +
    scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
    geom_text(aes(label = round(avg_ride, 1)), vjust = -0.5, size = 3.5, position = position_dodge(0.9)) +
    labs(x = "", y = "Average Ride Length (Minutes)", title = "Average ride length by bike type", subtitle = "") + 
    theme_classic() +
    theme(legend.title = element_blank())

While the “docked bike” designation is unique to Casual riders. The average ride length for docked bikes is notably higher than the average of electric or classic bikes. Docked bikes aside, we see the longest rides on average being taken by casuals on classic bikes.

When are they riding?

data %>% 
ggplot() +
 aes(x = factor(month, level=c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')), fill = member_casual) +
 geom_bar(position = "dodge") +
 scale_y_continuous(label = label_number_si()) +
 scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
 labs(x = "", y = "Rides", title = "Rides per month", subtitle = "") +
  theme_classic() +
  theme(legend.title = element_blank())
## Warning: `label_number_si()` was deprecated in scales 1.2.0.
## ℹ Please use the `scale_cut` argument of `label_number()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

A look at ride quantity on a monthly basis shows that ridership for both members and casuals follows a similar pattern. Both groups ride more often during the warmer summer months versus the colder winter months. However, casual riders exhibit a greater degree of seasonal fluctuation as they account for just a small fraction of rides during the winter months while riding much more often during the summer and nearly as much as members in July.

ggplot(data) +
  aes(x = factor(day_of_week, level=c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')), fill = member_casual) +
  geom_bar(position = "dodge") +
  scale_y_continuous(label = label_number_si()) +
  scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
  labs(x = "", y = "Rides", title = "Rides by day of week", subtitle = "") +
  theme_classic() +
  theme(legend.title = element_blank())

Members tend to ride most often during the middle of the work week (Tue, Wed, Thurs), while casual riders are partial to the the weekend with Friday, Saturday and Sunday as the most popular days to ride.

ggplot(data) +
  aes(x = factor(phase_of_day, level=c("Morning", "Afternoon", "Evening", "Night")), fill = member_casual) +
  geom_bar(position = "dodge") +
  scale_y_continuous(label = label_number(scale_cut = cut_short_scale())) +
  scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
  labs(x = "", y = "Rides", title = "Rides by Time of Day", subtitle = "") +
  theme_classic() +
  theme(legend.title = element_blank()) 

When we view the number of rides by time of day we can see that the afternoon (12pm - 6pm) is the most popular time to ride for both groups. Casuals appear to ride less in the morning but proportionately more than members at night (12am - 6am) when they ride nearly as much as members but account for a smaller percentage of total rides.

data %>% 
  group_by(phase_of_day, member_casual) %>%
  summarise(
    avg_ride = mean(as.numeric(ride_length), na.rm = T),
    .groups = 'drop'
  ) %>% 
  ggplot() +
    aes(x = factor(phase_of_day, level=c("Morning", "Afternoon", "Evening", "Night")), y = avg_ride, fill = member_casual) +
    geom_bar(position='dodge', stat='identity') +
    scale_fill_manual(values = c(Casual = color_casual, Member = color_member)) +
    geom_text(aes(label= round(avg_ride, 1)), vjust = -0.5, size = 3.5, position = position_dodge(0.9)) +
    labs(x = "", y = "Average Ride Length (Minutes)", title = "Average Ride Length by Time of Day", subtitle = "") + 
    theme_classic() +
    theme(legend.title = element_blank())

When visualizing ride length by time of day we can see that casual riders ride longer on average throughout the day but also that both groups ride longest during the afternoon while taking the shortest rides on average at night.

Where are they riding?

Ride density maps

# pepare data
stns <- data %>% 
  drop_na(start_station_name, start_station_lat, start_station_lng, member_casual) %>% 
  group_by(start_station_name, member_casual) %>%
  summarise(
    rides = n(), 
    start_station_lat = mean(start_station_lat, na.rm = T),
    start_station_lng = mean(start_station_lng, na.rm = T),
    .groups = 'drop'
  ) %>%
  arrange(desc(rides))

# set bounding box and import map
# bbox = c(top = 41.9825, right = -87.5, bottom = 41.75, left = -87.9002)
bbox = c(top = 42.05, right = -87.4, bottom = 41.65, left = -88)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 11)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# plot map
ggmap(import_map) +
  geom_density_2d(data = stns, mapping = aes(x = start_station_lng, y = start_station_lat), color = "black", alpha = 0.65, size = 1) +
  labs(title = "Ride density", caption = "", x = "", y = "") +
  facet_wrap(vars(member_casual)) + 
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 16 rows containing non-finite values (`stat_density2d()`).

Ride density comparisons illustrate a similar ride density distribution pattern for both members and casuals throughout the city overall. While members show a higher density of ridership in the downtown area, casuals show higher densities of ridership in some portions of the south side of Chicago.

Top 10 starting stations for Members and Casuals

# top member stations
top_stations_members <- data %>% 
  select(member_casual, start_station_name, start_station_lat, start_station_lng) %>% 
  drop_na(start_station_name) %>% 
  filter(member_casual == "Member") %>%
  group_by(member_casual, start_station_name, start_station_lat, start_station_lng) %>% 
  summarise(
    Rides = n(), 
    .groups = 'drop'

  ) %>%
  arrange(desc(Rides)) %>% 
  head(10)

# top casual stations
top_stations_casuals <- data %>% 
  select(member_casual, start_station_name, start_station_lat, start_station_lng) %>% 
  drop_na(start_station_name) %>% 
  filter(member_casual == "Casual") %>%
  group_by(member_casual,start_station_name, start_station_lat, start_station_lng) %>% 
  summarise(
    Rides = n(), 
    .groups = 'drop'
  ) %>%
  arrange(desc(Rides)) %>% 
  head(10)


# set bounding box and import map
bbox = c(top = 41.9825, right = -87.5, bottom = 41.75, left = -87.9002)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 11)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# plot map
ggmap(import_map) +
  geom_point(data = top_stations_members, mapping = aes(x = start_station_lng, y = start_station_lat, size = Rides, color = member_casual), alpha = 0.75) +
  geom_point(data = top_stations_casuals, mapping = aes(x = start_station_lng, y = start_station_lat, size = Rides, color = member_casual), alpha = 0.75) +
  labs(title = "Top 10 starting stations for members and casuals", caption = "") + 
  scale_color_manual(values=c(color_casual, color_member), name="Rider Type") +
  theme_map() +
  theme(plot.title = element_text(size = 13.5))

A look at the most popular stations reveals key differences in behavior between members and casuals. While the top ten start stations among casual riders hugs the Lake Michigan coastline, the most popular stations for members are just a few blocks to the west. Pair this pattern along with the weekly riding patterns of each group and we start to see a key trend develop:

Casual riders are most active on the weekend near Chicago’s coastal parks while member riders tend to be most active during the workweek just a few blocks to the west.

Top 10 routes most frequently traveled by members

data %>% 
  select(start_station_name, end_station_name, member_casual) %>% 
  drop_na(start_station_name, end_station_name, member_casual) %>% 
  filter(member_casual == "Member", !start_station_name == "", !end_station_name == "", !member_casual == "") %>%
  mutate(route = paste(start_station_name, "  -\n", end_station_name)) %>% 
  count(subscription = member_casual, route) %>% 
  arrange(desc(n)) %>% 
  head(10) %>% 
  ggplot() +
  geom_bar(aes(x = reorder(route, n), y = n), fill = color_member, stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Rides", title = "Most frequently traveled routes for members", subtitle = "") + 
  theme_classic() +
  theme(legend.title = element_blank())

Map of top 10 routes most frequently traveled by Members

# popular Routes
popular_routes <- data %>% 
  select(start_station_name, end_station_name, member_casual, start_station_lat, start_station_lng, end_station_lat, end_station_lng) %>% 
  drop_na(start_station_name, end_station_name, member_casual) %>% 
  filter(member_casual == "Member") %>%
  mutate(route = paste(start_station_name, "  - \n", end_station_name)) %>% 
  count(subscription = member_casual, route, start_station_name, start_station_lat, start_station_lng, end_station_name, end_station_lat, end_station_lng) %>% 
  arrange(desc(n)) %>% 
  head(10)

# set bounding box and import map
bbox = c(top = 41.95, right = -87.35, bottom = 41.72, left = -87.7275)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 11)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# plot map
ggmap(import_map) +

  # popular routes map
  geom_point(data = popular_routes, mapping = aes(x = start_station_lng, y = start_station_lat, size = n), color = color_member, alpha = 0.45) +
  geom_point(data = popular_routes, mapping = aes(x = end_station_lng, y = end_station_lat, size = n), color = color_member, alpha = 0.1) +

  geom_text(mapping = aes(x = -87.5, y = 41.794), label = "University of Chicago", size = 4, color = "#333333") +
  geom_text(mapping = aes(x = -87.515, y = 41.8358), label = "Illinois Inst of Technology", size = 4, color = "#333333") +
  geom_text(mapping = aes(x = -87.545, y = 41.871), label = "Univ of Illinois at Chicago", size = 4, color = "#333333") +

  # facet_wrap(vars(member_casual)) +
  labs(title = "Routes most frequently traveled by members", caption = "") + 
  theme_map() +
  theme(plot.title = element_text(size = 13.5)) 

The most frequently traveled routes by members occur on or near college campuses.

Top 10 routes most frequently traveled by casuals

data %>% 
  select(start_station_name, end_station_name, member_casual) %>% 
  drop_na(start_station_name, end_station_name, member_casual) %>% 
  filter(member_casual == "Casual", !start_station_name == "", !end_station_name == "", !member_casual == "") %>%
  mutate(route = paste(start_station_name, "  - \n", end_station_name)) %>% 
  count(subscription = member_casual, route) %>% 
  arrange(desc(n)) %>% 
  head(10) %>%
  ggplot() +
  geom_bar(aes(x = reorder(route, n), y = n), fill = color_casual, stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Rides", title = "Most frequently traveled routes for Casuals", subtitle = "") + 
  theme_classic() +
  theme(legend.title = element_blank())

Top 10 most frequently traveled routes by Casuals

# popular Routes
popular_routes <- data %>% 
  select(start_station_name, end_station_name, member_casual, start_station_lat, start_station_lng, end_station_lat, end_station_lng) %>% 
  drop_na(start_station_name, end_station_name, member_casual) %>% 
  filter(member_casual == "Casual") %>%
  mutate(route = paste(start_station_name, "  - \n", end_station_name)) %>% 
  count(subscription = member_casual, route, start_station_name, start_station_lat, start_station_lng, end_station_name, end_station_lat, end_station_lng) %>% 
  arrange(desc(n)) %>% 
  head(10)

bbox = c(top = 41.9825, right = -87.5, bottom = 41.75, left = -87.9002)
import_map <- get_map(bbox, maptype = "terrain", source = "stamen", zoom = 11)
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# plot map
ggmap(import_map) +

  # popular routes map
  geom_point(data = popular_routes, mapping = aes(x = start_station_lng, y = start_station_lat, size = n), color = color_member, alpha = 0.45) +
  geom_point(data = popular_routes, mapping = aes(x = end_station_lng, y = end_station_lat, size = n), color = color_member, alpha = 0.1) +
  labs(title = "Most frequently traveled routes by casuals", caption = "") + 
  theme_map() +
  theme(plot.title = element_text(size = 13.5))

The most frequently traveled routes for casual riders are clustered in and around Chicago’s coastal parks.

Conclusion

  1. Casual rides are fewer than member rides and last more than 8 minutes longer per ride on average than member rides.

  2. Casual riders ride more often on the weekends near Chicago’s coastal parks while member riders tend to be most active during the workweek several blocks from the shoreline.

  3. The most frequently traveled routes by members occur on and around college campuses while the most frequently traveled routes for casual riders are clustered in and around Chicago’s coastal parks.

Top 3 Recommendations

  1. Encourage casual riders to convert to an annual membership by highlighting the cost savings involved in riding more often. Cyclistic’s network of 1000+ bike stations makes it easy to ride for a variety of tasks such as recreation, school and work - anywhere in the Chicago area.

  2. Target advertising toward high densities of casual riders in Chicago’s coastal parks and ramp up advertising during the month of March when the anticipation of increased ridership is high.

  3. Gather more data by conducting a survey. Ask cyclistic riders: Why have you chosen Cyclistic for your mobility needs? What tasks do you perform when using Cylistic bikes? What do you like or dislike about Cyclistic?